# _*_coding:utf-8_*_
__author__ = 'gerry'

from sklearn.feature_extraction import DictVectorizer
import csv
from sklearn import preprocessing
from sklearn import tree
from sklearn import tree
from sklearn.externals.six import StringIO
from sklearn.cross_validation import train_test_split
import numpy as np
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report


# Read the CSV file into a list of feature rows and a list of labels.
# Column 1 of every row is taken as the class label and columns 2..end as the
# feature values; column 0 is not used.
#
# csv.reader wants a binary-mode file on Python 2 but a text-mode file opened
# with newline='' on Python 3. ``str is bytes`` is only true on Python 2, so
# it selects the right mode without needing an extra import.
if str is bytes:
    csv_mode, csv_open_kwargs = 'rb', {}
else:
    csv_mode, csv_open_kwargs = 'r', {'newline': ''}

data = []
labels = []
# The context manager closes the file even if reading a row raises.
with open(r'Medical.csv', csv_mode, **csv_open_kwargs) as allMedicalData:
    reader = csv.reader(allMedicalData)
    print(type(reader))
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; it carries no sample.
            continue
        data.append(row[2:])
        labels.append(row[1])

X = np.array(data)
y = np.array(labels)

# Split the samples into training and test data; test_size=0.2 holds out 20%
# of them as the test set.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a decision tree that uses information entropy as its split criterion.
clf = tree.DecisionTreeClassifier(criterion="entropy")
print(clf)
clf.fit(x_train, y_train)

# Write the fitted decision tree to the file tree.dot.
with open("tree.dot", 'w') as f:
    # export_graphviz writes into the handle passed as out_file, so its return
    # value is deliberately not assigned back to ``f`` (that would clobber the
    # open file handle's name inside the with-block).
    tree.export_graphviz(clf, out_file=f)

# The importance scores reflect how much each feature influences the result:
# the larger the value, the bigger the role of that feature in classification.
print(clf.feature_importances_)

# Print the results: the input samples, the predicted labels and the true
# labels, in that order.
# NOTE(review): the prediction is made on the training split; x_test / y_test
# are not evaluated here -- confirm whether that is intended.
answer = clf.predict(x_train)
for shown in (x_train, answer, y_train):
    print(shown)

# Disabled experiments, kept for reference only (none of this runs):
#   - accuracy:          np.mean(answer == y_train)
#   - precision/recall:  precision_recall_curve(y_train, clf.predict(x_train))
#   - class probability: clf.predict_proba(X)[:, 1]

































